# R Options
options(stringsAsFactors=FALSE, "citation_format"="pandoc")
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for 'leiden' clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat
plan("multiprocess") # by default, this uses all available cores; use "workers=4" for example, to use 4 cores
# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr
# Tables: kableExtra
# Plots: ggsci, ggpubr
# IO: openxlsx
# Annotation: biomaRt
# DEG: mast
# Functional enrichment: enrichR
# Source plotting functions
source("R/functions_io.R")
source("R/functions_plotting.R")
source("R/functions_analysis.R")
source("R/functions_util.R")
# Knitr default options
knitr::opts_chunk$set(echo=TRUE, cache=FALSE, message=FALSE, warning=FALSE, fig.width=10)
# Potentially needed for clustering, umap, other python packages
use_python('/usr/bin/python')
# biomaRt mirror
mirror="asia"This code chunk contains all parameters that are set specifically for your project.
param = list()
# Project ID
param$project = "pbmc"
# Input data path in case Cell Ranger was run
param$path_data = data.frame(name="pbmc",
type="10x",
path="test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/counts/")
param$file_mapping_stats = "test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/pbmc_1k_v3_metrics_summary.csv"
# Project-specific paths
param$path_out = "test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/results"
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE)
# Annotation via biomaRt
param$mart_dataset = "hsapiens_gene_ensembl"
param$file_annot = NULL
if (is.null(param$file_annot)) {
param$file_annot = file.path(param$path_out, paste0(param$mart_dataset, ".annot.csv"))
}
param$mart_attributes = c("ensembl_gene_id", "external_gene_name", "hgnc_symbol", "entrezgene_accession", "chromosome_name", "start_position", "end_position", "percentage_gene_gc_content", "gene_biotype", "strand", "description")
# Prefix of mitochondrial genes
param$mt = "^MT-"
# The number of PCs to use; adjust this parameter based on JackStraw and Elbowplot
param$pc_n = 10
# Whether or not to remove cell cycle effects
param$cc_remove = FALSE
# Should all cell cycle effects be removed, or only the difference between profilerating cells (G2M and S phase)?
# Read https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html, for an explanation
param$cc_remove_all = FALSE
# Resolution of clusters; low values will lead to fewer clusters of cells
param$cluster_resolution=0.5
# Thresholds to define differentially expressed genes
param$padj = 0.05
param$log2fc = log2(1.5)
# Marker genes based on literature
# https://icb-scanpy-tutorials.readthedocs-hosted.com/en/latest/visualizing-marker-genes.html
param$known_markers = list()
param$known_markers[["bcell"]] = c("CD79A", "MS4A1")
param$known_markers[["tcell"]] = "CD3D"
param$known_markers[["tcell.cd8+"]] = c("CD8A", "CD8B")
param$known_markers[["nk"]] = c("GNLY", "NKG7")
param$known_markers[["myeloid"]] = c("CST3", "LYZ")
param$known_markers[["monocytes"]] = "FCGR3A"
param$known_markers[["dendritic"]] = "FCER1A"
# Enrichr databases of interest
param$enrichr_dbs = c("GO_Molecular_Function_2018", "GO_Biological_Process_2018", "GO_Cellular_Component_2018")
# Main colour(s) to use for plots
param$col = "palevioletred"
# The number of colours provided should match the number of datasets
if (nrow(param$path_data) > 1) {
param$col_samples = ggsci::pal_npg()(nrow(param$path_data))
names(param$col_samples) = param$path_data$name
} else {
param$col_samples = param$col
}We begin by printing mapping statistics that have been produced prior to this workflow.
if (!is.null(param$file_mapping_stats)) {
mapping_stats = as.data.frame(t(read.delim(param$file_mapping_stats, sep=",", header=TRUE, check.names=FALSE)))
colnames(mapping_stats) = "Value"
knitr::kable(mapping_stats, align="l", caption="Mapping statistics") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
} else {
message("Mapping statistics cannot be shown. No valid file provided.")
}| Value | |
|---|---|
| Estimated Number of Cells | 1,222 |
| Mean Reads per Cell | 54,502 |
| Median Genes per Cell | 1,919 |
| Number of Reads | 66,601,887 |
| Valid Barcodes | 97.4% |
| Sequencing Saturation | 70.8% |
| Q30 Bases in Barcode | 94.1% |
| Q30 Bases in RNA Read | 90.2% |
| Q30 Bases in Sample Index | 91.1% |
| Q30 Bases in UMI | 92.7% |
| Reads Mapped to Genome | 95.4% |
| Reads Mapped Confidently to Genome | 92.4% |
| Reads Mapped Confidently to Intergenic Regions | 4.8% |
| Reads Mapped Confidently to Intronic Regions | 31.1% |
| Reads Mapped Confidently to Exonic Regions | 56.5% |
| Reads Mapped Confidently to Transcriptome | 53.7% |
| Reads Mapped Antisense to Gene | 1.0% |
| Fraction Reads in Cells | 94.9% |
| Total Genes Detected | 18,391 |
| Median UMI Counts per Cell | 6,628 |
We read gene annotation from Ensembl, and translate Ensembl IDs to Entrez gene symbols. The resulting table is written to file.
# Read annotation from csv or from Ensembl and a tab separated csv will be created
if (file.exists(param$file_annot)) {
annot_ensembl = read.delim(param$file_annot)
# Double-check whether all required columns are included
if (any(!c("ensembl_gene_id", "external_gene_name","entrezgene_accession") %in% colnames(annot_ensembl))) {
stop("The annotation table misses at least one of the following columns: 'ensembl_gene_id', 'external_gene_name','entrezgene_accession'.")
}
} else {
mart_attributes = unique(c("ensembl_gene_id", "external_gene_name","entrezgene_accession", param$mart_attributes))
annot_mart = biomaRt::useEnsembl("ensembl", dataset=param$mart_dataset, mirror=mirror)
annot_ensembl = biomaRt::getBM(mart=annot_mart, attributes=mart_attributes)
write.table(annot_ensembl, file=param$file_annot, sep='\t', col.names=TRUE, row.names=FALSE, append=FALSE)
message("Gene annotation file was created at: ", param$file_annot)
# Note: depending on the attributes, there might be more than one row per gene
}
# Create translation tables
# Ensembl id to gene symbol
ensembl_to_symbol = unique(annot_ensembl[,c("ensembl_gene_id","external_gene_name")])
ensembl_to_symbol = setNames(ensembl_to_symbol$external_gene_name, ensembl_to_symbol$ensembl_gene_id)
# Ensembl id to seurat-compatible unique rowname
ensembl_to_seurat_rowname = unique(annot_ensembl[,c("ensembl_gene_id", "external_gene_name")])
ensembl_to_seurat_rowname$external_gene_name = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname$external_gene_name, fixed=T))
ensembl_to_seurat_rowname = setNames(ensembl_to_seurat_rowname$external_gene_name, ensembl_to_seurat_rowname$ensembl_gene_id)
# Seurat-compatible unique rowname to ensembl id
seurat_rowname_to_ensembl = setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)
# Gene symbol to ensembl id: named LIST to account for genes where one symbol translates to multiple Ensembl IDs
symbol_to_ensembl_df = unique(annot_ensembl[,c("ensembl_gene_id","external_gene_name")])
symbol_to_ensembl = split(symbol_to_ensembl_df$ensembl_gene_id, symbol_to_ensembl_df$external_gene_name)
# Gene symbol to (seurat compatible unique) gene symbol: named LIST to account for genes with multiple names
symbol_to_seurat_rowname = unique(annot_ensembl[,c("ensembl_gene_id","external_gene_name")])
symbol_to_seurat_rowname$seurat_rowname = ensembl_to_seurat_rowname[symbol_to_seurat_rowname$ensembl_gene_id]
symbol_to_seurat_rowname = split(symbol_to_seurat_rowname$seurat_rowname, symbol_to_seurat_rowname$external_gene_name)
# Ensembl to entrez
ensembl_to_entrez = unique(annot_ensembl[, c("ensembl_gene_id", "entrezgene_accession")])
ensembl_to_entrez$entrezgene_accession = ifelse(nchar(ensembl_to_entrez$entrezgene_accession) == 0, NA, ensembl_to_entrez$entrezgene_accession)
ensembl_to_entrez = split(ensembl_to_entrez$entrezgene_accession, ensembl_to_entrez$ensembl_gene_id)
# Seurat-compatible unique rowname to entrez
seurat_rowname_to_ensembl_match = match(seurat_rowname_to_ensembl,names(ensembl_to_entrez))
names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
seurat_rowname_to_entrez = purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})We next read the scRNA-seq counts table(s) to initialise a Seurat object.
# List of Seurat objects
sc = list()
datasets = param$path_data
for (i in seq(nrow(datasets))) {
name = datasets[i,"name"]
type = datasets[i,"type"]
path = datasets[i,"path"]
# Read 10X or smartseq2
if (type == "10x") {
# Read 10X data into a Seurat object
sc[[name]] = Read10xDataset(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname)
} else if (type == "smartseq2") {
# Read SmartSeq-2 data into a Seurat object
# Note: drop unneccessary columns, e.g. columns 2-6 for featureCounts results
sc[[name]] = ReadSmartseq2Dataset(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname)
# Parse plate information: sample name, plate number, plate row and plate column
plate_information = parse_plate_information(Seurat::Cells(sc[[name]]),
pattern='^(\\S+)_(\\d+)_([A-Z])(\\d+)$',
sample_name_group=1,
plate_number_group=2,
row_name_group=3,
col_name_group=4)
if (nrow(plate_information) > 0) {
# Add info to metadata
sc[[name]] = Seurat::AddMetaData(sc[[name]], plate_information)
# Also set identity
sc[[name]] = Seurat::SetIdent(sc[[name]], value="SampleName")
}
}
}
sc## $pbmc
## An object of class Seurat
## 33538 features across 1222 samples within 1 assay
## Active assay: RNA (33538 features, 0 variable features)
We start the analysis by removing unwanted cells from the dataset. Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature”), the total number of molecules detected in each cell (“nCount”), and the percentage of reads that map to the mitochrondrial genome (“percent.mt”).
# Calculate percentage of reads that map to mitochondrial genome
sc = PercentageFeatureSet(sc, pattern=param$mt, col.name="percent.mt")
# Metadata
knitr::kable(head(sc@meta.data, 5), align="l", caption="Meta-data, top 5 rows") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")| orig.ident | nCount_RNA | nFeature_RNA | percent.mt | |
|---|---|---|---|---|
| AAACCCAAGGAGAGTA-1 | pbmc | 8288 | 2620 | 10.774614 |
| AAACGCTTCAGCCCAG-1 | pbmc | 5512 | 1808 | 7.964441 |
| AAAGAACAGACGACTG-1 | pbmc | 4283 | 1562 | 6.187252 |
| AAAGAACCAATGGCAG-1 | pbmc | 2754 | 1225 | 5.991285 |
| AAAGAACGTCTGCAAT-1 | pbmc | 6592 | 1831 | 6.614078 |
# Filter cutoffs
cells_filter = c(nFeature_RNA=200, nCount_RNA=NA, percent.mt=20)
features_filter = c(min_counts=1, min_cells=1) # feature has to be found by at least one count in one cell
# Plot QC metrics
features = c("nFeature_RNA", "nCount_RNA", "percent.mt")
p = VlnPlot(sc, features=features, pt.size=0, col=param$col_samples, combine=FALSE)
names(p) = features
for (i in features) {
p[[i]] = PlotMystyle(p[[i]], title=i, legend_position="none") + xlab("")
if (!is.na(cells_filter[i])) p[[i]] = p[[i]] + geom_hline(yintercept=cells_filter[i], lty=2, col="darkgrey")
}
p = patchwork::wrap_plots(p, ncol = 3) + patchwork::plot_annotation("Distribution of feature values")
p # Correlate QC metrics
p = list()
p[[1]] = Seurat::FeatureScatter(sc, feature1=features[2], feature2=features[1], cols=param$col_samples)
p[[2]] = Seurat::FeatureScatter(sc, feature1=features[2], feature2=features[3], cols=param$col_samples)
for (i in 1:length(p)) p[[i]] = PlotMystyle(p[[i]], legend_position="none")
p = patchwork::wrap_plots(p, ncol = 2) + patchwork::plot_annotation("Features plotted against each other")
p# Filter cells
sc = subset(sc, subset=nFeature_RNA>cells_filter["nFeature_RNA"] & percent.mt<cells_filter["percent.mt"])
# Filter features
features_count_sum = Matrix::rowSums(Seurat::GetAssayData(sc, slot="counts", assay="RNA") >= features_filter["min_counts"])
features_keep = which(features_count_sum > features_filter["min_cells"])
sc = subset(sc, features=features_keep)
sc## An object of class Seurat
## 16220 features across 1113 samples within 1 assay
## Active assay: RNA (16220 features, 0 variable features)
In this section, we subsequently run a series of Seurat functions:
1. We start by running a standard log normalisation, where counts for each cell are divided by the total counts for that cell and multiplied by 10,000. This is then natural-log transformed.
2. To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score). This way, genes are equally weighted for downstream analysis.
3. Variable features are selected. For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly expressed in others.
In addition, we run:
4. SCTransform, a new and more sophisticated normalisation method that replaces the previous functions (normalisation, scaling and variable features). Note that although results are kept for steps 1-3 (“RNA” assay), downstream analyses will default to resulting data of step 4 (“SCT” assay).
If specified in the above parameter section, cell cycle scores are assigned to each cell based on its normalised expression of G2/M and S phase markers, after basic normalisation (step 1). Cell cycle effects are then removed during scaling (step 4). Done for this report: FALSE.
# This function often crashes with multicore support, so stop for a moment
future::plan("sequential")
# Normalise data the original way
sc = Seurat::NormalizeData(sc, normalization.method = "LogNormalize", scale.factor = 10000)
# Back to multicore support
future::plan("multicore")if(param$cc_remove) {
# Use biomart to translate human cell cycle genes to the species of interest
mart_human = biomaRt::useEnsembl("ensembl", dataset="hsapiens_gene_ensembl", mirror=mirror)
mart_myspecies = biomaRt::useEnsembl("ensembl", dataset=param$mart_dataset, mirror=mirror)
genes_s = biomaRt::getLDS(attributes=c("external_gene_name"),
filters="external_gene_name",
values=cc.genes.updated.2019$s.genes,
mart=mart_human,
attributesL=c("external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
genes_g2m = biomaRt::getLDS(attributes=c("external_gene_name"),
filters="external_gene_name",
values=cc.genes.updated.2019$g2m.genes,
mart=mart_human,
attributesL=c("external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
# Determine cell cycle effect
sc = Seurat::CellCycleScoring(object=sc, s.features=genes_s[,2], g2m.features=genes_g2m[,2], set.ident=FALSE)
# Determine difference between G2M and S phase scores
sc$CC.Difference = sc$S.Score - sc$G2M.Score
}# Scaling "RNA" normalised counts is required to run PCA, which again is required for the JackStraw plot
# Note that we do NOT remove cell cycle effects in "RNA" scaled data, since this is very slow and
# only used for the JackStraw plot
sc = Seurat::ScaleData(sc, features=VariableFeatures(sc))
# Probably obsolete, not needed anywhere in this report
# Select features from normalised data (unaffected by scaling)
sc = Seurat::FindVariableFeatures(sc, selection.method = "vst", nfeatures = 3000)# Run sctransform
if (param$cc_remove) {
# Regress out cell cycle effect
if(param$cc_remove_all) {
# Remove all signal associated to cell cycle
sc = SCTransform(sc, vars.to.regress=c("S.Score", "G2M.Score"))
} else {
# Remove only the difference between G2M and S phase
sc = SCTransform(sc, vars.to.regress="CC.Difference")
}
} else {
sc = SCTransform(sc)
}
# "SCT" is now the default assay (active.assay)
# This is a new normalisation method that replaces previous Seurat functions 'NormalizeData', 'FindVariableFeatures', and 'ScaleData'.
# vignette: https://satijalab.org/seurat/v3.0/sctransform_vignette.html
# paper: https://www.biorxiv.org/content/10.1101/576827v2
# Normalised data end up here: sc@assays$SCT@data
# Note: If I write Seurat::SCTransform, I get the following error message in the next chunk for
# Seurat::VariableFeaturePlot(sc, cols=c("grey", param$col))
# 'vst' is Unable to find highly variable feature information for method 'vst'# Show variable genes
top10 = head(Seurat::VariableFeatures(sc), 10)
# Plot variable features with and without labels
p1 = Seurat::VariableFeaturePlot(sc, cols=c("grey", param$col))
p1 = PlotMystyle(p1)
p2 = LabelPoints(plot=p1, points=top10, repel=TRUE)
p = p1 + p2 & theme(legend.position="bottom")
p = p + patchwork::plot_annotation("Variable genes without (left) and with (right) labels") +
patchwork::plot_layout(guides = "collect")
pn_cells_plot = min(200, length(Seurat::Cells(sc)))
cells_random = sample(1:length(Seurat::Cells(sc)), n_cells_plot)
raw = GetAssayData(sc, assay="RNA", slot="counts") %>% as.matrix()
raw = raw[, cells_random]
norm1 = GetAssayData(sc, assay="RNA", slot="data") %>% as.matrix()
norm1 = norm1[, cells_random]
norm2 = GetAssayData(sc, assay="SCT", slot="data") %>% as.matrix()
norm2 = norm2[, cells_random]To better understand the efficiency of the applied normalisation procedures, we plot the relative log expression of genes in 200 randomly selected cells before and after normalisation. This type of plot reveals unwanted variation in your data. The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes, leaving only variation between samples. If expression levels of most genes are similar in all cell types, sample heterogeneity is a sign of unwanted variation.
For each gene, we calculate its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (palevioletred).
A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. The biological manifold however can be described by far fewer dimensions than the number of genes. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarise a dataset, and to visualise a dataset.
We use Principal Component Analysis (PCA) to summarise a dataset, overcoming noise and reducing the data to its essential components. Each principal component (PC) represents a “metafeature” that combines information across a correlated gene set. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualise the dataset, placing similar cells together in 2D space, see below.
To decide how many PCs to include in downstream analyses, we visualize cells and genes that define the PCA.
# Run PCA for "RNA" assay; this is required for the JackStraw plot
sc = Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), assay="RNA")
# Run PCA for the default "SCT" assay
sc = Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc))
p = Seurat::VizDimLoadings(sc, dims=1:2, reduction="pca", col=param$col, combine=FALSE)
for (i in 1:length(p)) p[[i]] = PlotMystyle(p[[i]])
p = patchwork::wrap_plots(p, ncol = 2) + patchwork::plot_annotation("Top gene loadings of the first two PCs")
pp = Seurat::DimPlot(sc, reduction="pca", cols=param$col_samples)
p = PlotMystyle(p, title="Cells arranged by the first two PCs", legend_position="bottom")
pWe next need to decide how many PCs we want to use for downstream analyses. The following two plots are designed to help us make an informed decision.
The first plot is based on the “JackStraw” procedure: parts of the data is repeatedly randomly permuted and PCA is rerun, generating a “null distribution” of feature scores. Significant PCs are those with a strong enrichment of low p-value features.
The second plot is an “Elbow plot”: PCs are ranked based on the percentage of variance they explain.
For your dataset, we decided to go for 10 PCs.
# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
sc = Seurat::JackStraw(sc, num.replicate=100, dims=20, assay="RNA")
sc = Seurat::ScoreJackStraw(sc, dims=1:20)
p = Seurat::JackStrawPlot(sc, dims=1:20)
p = PlotMystyle(p, title="Jack Straw plot", legend_position="bottom")
pSeurat’s clustering method first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm.
# Note: I changed the seed in ./lib/python3.6/site-packages/leidenalg/functions.py to 11 for reproducibility
# The number of clusters can be optimized by tuning 'resolution' -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
sc = Seurat::FindNeighbors(sc, dims=1:param$pc_n)
# Cluster using the Leiden algorithm
# Paper to Leiden algorithm: https://www.nature.com/articles/s41598-019-41695-z
# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
sc = Seurat::FindClusters(sc, resolution=param$cluster_resolution, algorithm=4)We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see (“Understanding Umap” 2019).
sc = Seurat::RunUMAP(sc, dims=1:param$pc_n)
# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster_cells = table(sc@active.ident)
cluster_labels = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
p = Seurat::DimPlot(sc, reduction="umap", label=TRUE) + scale_colour_discrete("Cluster", labels=cluster_labels)
p = PlotMystyle(p, "UMAP, cells coloured by cluster identity", legend_position="bottom")
pHow much do gene expression profiles in your dataset reflect the cell cycle phases the single cells were in? We determine the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene names are translated using biomaRt.
# If not done in a previous chunk, get cell cycle genes translated
if(!param$cc_remove) {
# Use biomart to translate human cell cycle genes to the species of interest
mart_human = biomaRt::useEnsembl("ensembl", dataset="hsapiens_gene_ensembl", mirror=mirror)
mart_myspecies = biomaRt::useEnsembl("ensembl", dataset=param$mart_dataset, mirror=mirror)
genes_s = biomaRt::getLDS(attributes=c("external_gene_name"),
filters="external_gene_name",
values=cc.genes.updated.2019$s.genes,
mart=mart_human,
attributesL=c("external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
genes_g2m = biomaRt::getLDS(attributes=c("external_gene_name"),
filters="external_gene_name",
values=cc.genes.updated.2019$g2m.genes,
mart=mart_human,
attributesL=c("external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
}
# Determine cell cycle effect, this time based on default assay "SCT"
sc = Seurat::CellCycleScoring(object=sc, s.features=genes_s[,2], g2m.features=genes_g2m[,2], set.ident=FALSE)
# Get a feeling for how many cells are affected
p1 = ggplot(sc@meta.data, aes(x=S.Score, y=G2M.Score, colour=Phase)) +
geom_point() +
scale_x_continuous("G1/S score") +
scale_y_continuous("G2/M score")
p1 = PlotMystyle(p1)
p2 = ggplot(sc@meta.data %>%
dplyr::group_by(seurat_clusters,Phase) %>%
dplyr::summarise(num_reads=length(Phase)),
aes(x=seurat_clusters, y=num_reads, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_x_discrete("Seurat clusters") +
scale_y_continuous("Fraction of cells")
p2 = PlotMystyle(p2)
p = p1 + p2 & theme(legend.position="bottom")
p = p + patchwork::plot_annotation(title="Cell cycle phases")
p# UMAP with phases superimposed
p = Seurat::DimPlot(sc, group.by="Phase", pt.size=1)
PlotMystyle(p, title="UMAP, cells coloured by cell cycle phases", legend_title="Phase")p1 = Seurat::FeaturePlot(sc, features="S.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col))
p1 = PlotMystyle(p1, title="UMAP, cells coloured by S phase")
p2 = Seurat::FeaturePlot(sc, features="G2M.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col))
p2 = PlotMystyle(p2, title="UMAP, cells coloured by G2M phase")
p = p1 + p2
p Do cells in individual clusters have particularly high counts, detected genes or mitochondrial content?
p = Seurat::FeaturePlot(sc, features=features, cols=c("lightgrey", param$col), combine=FALSE)
names(p) = features
for (i in features) p[[i]] = PlotMystyle(p[[i]], title=i)
patchwork::wrap_plots(p, ncol=2)Do cells in individual clusters express provided known marker genes?
g = unique(unlist(param$known_markers))
g = g[length(g):1]
p = Seurat::DotPlot(sc, features=g, cols=c("lightgrey", param$col)) +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5))
p = PlotMystyle(p, title="Dot plot of known marker genes")
pWe next identify genes that are differentially expressed in one cluster compared to all other clusters. Additional gene annotation is added, and the resulting tables are written to file.
# We load and unload the MAST R package in this chunk, as it overwrites Seurat functions
library(MAST)
# Find markers for every cluster compared to all remaining cells, report positive and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers
# Review recommends using "MAST"; Mathias uses "LR"
# ALWAYS USE: assay="RNA" or assay="SCT"
# DONT USE: assay=integrated datasets; this data is normalised and contains only 2k genes
sc_markers = Seurat::FindAllMarkers(sc, assay="RNA", only.pos=FALSE, min.pct=0.25, logfc.threshold=0.25, test.use="MAST")
sc_markers_top2 = sc_markers %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=2, wt=avg_logFC) %>%
as.data.frame()
# Show top 2 merkers per cluster
knitr::kable(sc_markers_top2, align="l", caption="Top 2 DEGs per cell cluster") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| p_val | avg_logFC | pct.1 | pct.2 | p_val_adj | cluster | gene |
|---|---|---|---|---|---|---|
| 0 | 3.8541415 | 1.000 | 0.154 | 0 | 1 | S100A8 |
| 0 | 3.5038506 | 1.000 | 0.202 | 0 | 1 | S100A9 |
| 0 | 1.3106631 | 0.926 | 0.274 | 0 | 2 | IL7R |
| 0 | 1.0562344 | 0.956 | 0.307 | 0 | 2 | IL32 |
| 0 | 0.9381407 | 1.000 | 0.718 | 0 | 3 | LDHB |
| 0 | 0.9501478 | 0.922 | 0.328 | 0 | 3 | TCF7 |
| 0 | 2.0831939 | 0.843 | 0.046 | 0 | 4 | GZMK |
| 0 | 1.8677970 | 0.732 | 0.140 | 0 | 4 | KLRB1 |
| 0 | 3.0537769 | 0.992 | 0.062 | 0 | 5 | IGHM |
| 0 | 2.7526924 | 0.616 | 0.019 | 0 | 5 | IGLC2 |
| 0 | 1.7487947 | 1.000 | 0.295 | 0 | 6 | CST3 |
| 0 | 1.6637747 | 1.000 | 0.515 | 0 | 6 | HLA-DPA1.2 |
| 0 | 4.0670280 | 0.933 | 0.296 | 0 | 7 | IGKC |
| 0 | 4.2867207 | 0.617 | 0.047 | 0 | 7 | IGHA1 |
| 0 | 3.9021643 | 0.981 | 0.072 | 0 | 8 | GNLY |
| 0 | 2.9158113 | 1.000 | 0.186 | 0 | 8 | NKG7 |
| 0 | 5.0470427 | 1.000 | 0.237 | 0 | 9 | NRGN |
| 0 | 5.8976484 | 1.000 | 0.042 | 0 | 9 | PPBP |
# Add Ensembl annotation
sc_markers_ensembl = seurat_rowname_to_ensembl[sc_markers[,"gene"]]
sc_markers_annot = cbind(sc_markers, annot_ensembl[sc_markers_ensembl,])
# Output in Excel sheet
sc_markers_lst = lapply(levels(sc_markers_annot$cluster), function(x) {sc_markers_annot %>% dplyr::filter(cluster==x)})
names(sc_markers_lst) = paste0("cluster", levels(sc_markers$cluster))
openxlsx::write.xlsx(sc_markers_lst, file=paste0(param$path_out, "/markers.xlsx"))
# Filter markers based on p-value and fold-change
sc_markers_filt = sc_markers %>%
dplyr::filter(p_val_adj <= param$padj) %>%
dplyr::filter((avg_logFC <= -param$log2fc) | (avg_logFC >= param$log2fc)) %>%
as.data.frame()
sc_markers_filt_down = sc_markers_filt %>%
dplyr::filter(avg_logFC <= -param$log2fc) %>%
as.data.frame()
sc_markers_filt_up = sc_markers_filt %>%
dplyr::filter(avg_logFC >= param$log2fc) %>%
as.data.frame()
# Number of DEGs per cluster
cluster_all = sort(unique(sc_markers[,"cluster"]))
sc_markers_filt_n = cbind(Cluster=cluster_all,
Down=sapply(cluster_all, function(x) sum(sc_markers_filt_down$cluster == x)),
Up=sapply(cluster_all, function(x) sum(sc_markers_filt_up$cluster == x))) %>%
as.data.frame() %>%
tidyr::pivot_longer(cols=c("Down", "Up"),
names_to="Direction",
values_to="n")
sc_markers_filt_n$Cluster = as.factor(sc_markers_filt_n$Cluster)
p = ggplot(sc_markers_filt_n, aes(x=Cluster, y=n, fill=Direction)) + geom_bar(stat="identity")
p = PlotMystyle(p,
title=paste0("Number of DEGs per cell cluster\n(FC=", 2^param$log2fc, ", adj. p-value=", param$padj, ")"),
col=c("steelblue", "darkgoldenrod1"))
pThe following plots are exemplary to how we can visualize differentially expressed genes using the “Seurat” R-package. The selected genes are the top differentially expressed genes for all clusters, respectively.
# Get top gene per cluster and plot
genes_example = sc_markers %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=1, wt=avg_logFC) %>%
dplyr::pull(gene) %>%
unique()
# Shows gene expression on the UMAP
p = Seurat::FeaturePlot(sc, features=genes_example, cols=c("lightgrey", param$col), combine=FALSE)
names(p) = genes_example
for (i in names(p)) p[[i]] = PlotMystyle(p[[i]], title=i)
p = patchwork::wrap_plots(p, ncol=2) +
patchwork::plot_annotation(title="UMAP, cells coloured by normalised gene expression data")
p# Ridge plot of raw gene expression counts
p = Seurat::RidgePlot(sc, features=genes_example, slot="counts", combine=FALSE)
names(p) = genes_example
for (i in names(p)) p[[i]] = PlotMystyle(p[[i]], title=i, legend_title="Cell identity")
p = patchwork::wrap_plots(p, ncol=2) +
patchwork::plot_annotation(title="Ridge plot of raw gene expression counts") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
p# Ridge plot of normalised gene expression data
p = Seurat::RidgePlot(sc, features=genes_example, combine=FALSE)
names(p) = genes_example
for (i in names(p)) p[[i]] = PlotMystyle(p[[i]], title=i, legend_title="Cell identity")
p = patchwork::wrap_plots(p, ncol=2) +
patchwork::plot_annotation(title="Ridge plot of normalised gene expression data") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
p# Visualises how feature expression changes across different clusters
p = Seurat::DotPlot(sc, features=genes_example[length(genes_example):1], cols=c("lightgrey", param$col))
p = PlotMystyle(p, title="Dot plot of normalised gene expression data")
p# Heatmap of top differentially expressed genes
top = sc_markers %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=10, wt=avg_logFC)
p = Seurat::DoHeatmap(sc, features=top$gene) + NoLegend()
pTo gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.
We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website (Chen 2020). You can choose to test functional enrichment from a wide range of databases:
dbs_all = enrichR::listEnrichrDbs()
knitr::kable(dbs_all, align="l", caption="Enrichr databases") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="300px")| geneCoverage | genesPerTerm | libraryName | link | numTerms |
|---|---|---|---|---|
| 13362 | 275 | Genome_Browser_PWMs | http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ | 615 |
| 27884 | 1284 | TRANSFAC_and_JASPAR_PWMs | http://jaspar.genereg.net/html/DOWNLOAD/ | 326 |
| 6002 | 77 | Transcription_Factor_PPIs | 290 | |
| 47172 | 1370 | ChEA_2013 | http://amp.pharm.mssm.edu/lib/cheadownload.jsp | 353 |
| 47107 | 509 | Drug_Perturbations_from_GEO_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 701 |
| 21493 | 3713 | ENCODE_TF_ChIP-seq_2014 | http://genome.ucsc.edu/ENCODE/downloads.html | 498 |
| 1295 | 18 | BioCarta_2013 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 249 |
| 3185 | 73 | Reactome_2013 | http://www.reactome.org/download/index.html | 78 |
| 2854 | 34 | WikiPathways_2013 | http://www.wikipathways.org/index.php/Download_Pathways | 199 |
| 15057 | 300 | Disease_Signatures_from_GEO_up_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 4128 | 48 | KEGG_2013 | http://www.kegg.jp/kegg/download/ | 200 |
| 34061 | 641 | TF-LOF_Expression_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 269 |
| 7504 | 155 | TargetScan_microRNA | http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 | 222 |
| 16399 | 247 | PPI_Hub_Proteins | http://amp.pharm.mssm.edu/X2K | 385 |
| 12753 | 57 | GO_Molecular_Function_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 1136 |
| 23726 | 127 | GeneSigDB | http://genesigdb.org/genesigdb/downloadall.jsp | 2139 |
| 32740 | 85 | Chromosome_Location | http://software.broadinstitute.org/gsea/msigdb/index.jsp | 386 |
| 13373 | 258 | Human_Gene_Atlas | http://biogps.org/downloads/ | 84 |
| 19270 | 388 | Mouse_Gene_Atlas | http://biogps.org/downloads/ | 96 |
| 13236 | 82 | GO_Cellular_Component_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 641 |
| 14264 | 58 | GO_Biological_Process_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 5192 |
| 3096 | 31 | Human_Phenotype_Ontology | http://www.human-phenotype-ontology.org/ | 1779 |
| 22288 | 4368 | Epigenomics_Roadmap_HM_ChIP-seq | http://www.roadmapepigenomics.org/ | 383 |
| 4533 | 37 | KEA_2013 | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 474 |
| 10231 | 158 | NURSA_Human_Endogenous_Complexome | https://www.nursa.org/nursa/index.jsf | 1796 |
| 2741 | 5 | CORUM | http://mips.helmholtz-muenchen.de/genre/proj/corum/ | 1658 |
| 5655 | 342 | SILAC_Phosphoproteomics | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 84 |
| 10406 | 715 | MGI_Mammalian_Phenotype_Level_3 | http://www.informatics.jax.org/ | 71 |
| 10493 | 200 | MGI_Mammalian_Phenotype_Level_4 | http://www.informatics.jax.org/ | 476 |
| 11251 | 100 | Old_CMAP_up | http://www.broadinstitute.org/cmap/ | 6100 |
| 8695 | 100 | Old_CMAP_down | http://www.broadinstitute.org/cmap/ | 6100 |
| 1759 | 25 | OMIM_Disease | http://www.omim.org/downloads | 90 |
| 2178 | 89 | OMIM_Expanded | http://www.omim.org/downloads | 187 |
| 851 | 15 | VirusMINT | http://mint.bio.uniroma2.it/download.html | 85 |
| 10061 | 106 | MSigDB_Computational | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 858 |
| 11250 | 166 | MSigDB_Oncogenic_Signatures | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 189 |
| 15406 | 300 | Disease_Signatures_from_GEO_down_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 17711 | 300 | Virus_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 17576 | 300 | Virus_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 15797 | 176 | Cancer_Cell_Line_Encyclopedia | https://portals.broadinstitute.org/ccle/home | 967 |
| 12232 | 343 | NCI-60_Cancer_Cell_Lines | http://biogps.org/downloads/ | 93 |
| 13572 | 301 | Tissue_Protein_Expression_from_ProteomicsDB | https://www.proteomicsdb.org/ | 207 |
| 6454 | 301 | Tissue_Protein_Expression_from_Human_Proteome_Map | http://www.humanproteomemap.org/index.php | 30 |
| 3723 | 47 | HMDB_Metabolites | http://www.hmdb.ca/downloads | 3906 |
| 7588 | 35 | Pfam_InterPro_Domains | ftp://ftp.ebi.ac.uk/pub/databases/interpro/ | 311 |
| 7682 | 78 | GO_Biological_Process_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 941 |
| 7324 | 172 | GO_Cellular_Component_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 205 |
| 8469 | 122 | GO_Molecular_Function_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 402 |
| 13121 | 305 | Allen_Brain_Atlas_up | http://www.brain-map.org/ | 2192 |
| 26382 | 1811 | ENCODE_TF_ChIP-seq_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 816 |
| 29065 | 2123 | ENCODE_Histone_Modifications_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 412 |
| 280 | 9 | Phosphatase_Substrates_from_DEPOD | http://www.koehn.embl.de/depod/ | 59 |
| 13877 | 304 | Allen_Brain_Atlas_down | http://www.brain-map.org/ | 2192 |
| 15852 | 912 | ENCODE_Histone_Modifications_2013 | http://genome.ucsc.edu/ENCODE/downloads.html | 109 |
| 4320 | 129 | Achilles_fitness_increase | http://www.broadinstitute.org/achilles | 216 |
| 4271 | 128 | Achilles_fitness_decrease | http://www.broadinstitute.org/achilles | 216 |
| 10496 | 201 | MGI_Mammalian_Phenotype_2013 | http://www.informatics.jax.org/ | 476 |
| 1678 | 21 | BioCarta_2015 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 239 |
| 756 | 12 | HumanCyc_2015 | http://humancyc.org/ | 125 |
| 3800 | 48 | KEGG_2015 | http://www.kegg.jp/kegg/download/ | 179 |
| 2541 | 39 | NCI-Nature_2015 | http://pid.nci.nih.gov/ | 209 |
| 1918 | 39 | Panther_2015 | http://www.pantherdb.org/ | 104 |
| 5863 | 51 | WikiPathways_2015 | http://www.wikipathways.org/index.php/Download_Pathways | 404 |
| 6768 | 47 | Reactome_2015 | http://www.reactome.org/download/index.html | 1389 |
| 25651 | 807 | ESCAPE | http://www.maayanlab.net/ESCAPE/ | 315 |
| 19129 | 1594 | HomoloGene | http://www.ncbi.nlm.nih.gov/homologene | 12 |
| 23939 | 293 | Disease_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23561 | 307 | Disease_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23877 | 302 | Drug_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 15886 | 9 | Genes_Associated_with_NIH_Grants | https://grants.nih.gov/grants/oer.htm | 32876 |
| 24350 | 299 | Drug_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 3102 | 25 | KEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 428 |
| 31132 | 298 | Gene_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 30832 | 302 | Gene_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 48230 | 1429 | ChEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 395 |
| 5613 | 36 | dbGaP | http://www.ncbi.nlm.nih.gov/gap | 345 |
| 9559 | 73 | LINCS_L1000_Chem_Pert_up | https://clue.io/ | 33132 |
| 9448 | 63 | LINCS_L1000_Chem_Pert_down | https://clue.io/ | 33132 |
| 16725 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_down | http://www.gtexportal.org/ | 2918 |
| 19249 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_up | http://www.gtexportal.org/ | 2918 |
| 15090 | 282 | Ligand_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 16129 | 292 | Aging_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15309 | 308 | Aging_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15103 | 318 | Ligand_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 15022 | 290 | MCF7_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15676 | 310 | MCF7_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15854 | 279 | Microbe_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 15015 | 321 | Microbe_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 3788 | 159 | LINCS_L1000_Ligand_Perturbations_down | https://clue.io/ | 96 |
| 3357 | 153 | LINCS_L1000_Ligand_Perturbations_up | https://clue.io/ | 96 |
| 12668 | 300 | L1000_Kinase_and_GPCR_Perturbations_down | https://clue.io/ | 3644 |
| 12638 | 300 | L1000_Kinase_and_GPCR_Perturbations_up | https://clue.io/ | 3644 |
| 8973 | 64 | Reactome_2016 | http://www.reactome.org/download/index.html | 1530 |
| 7010 | 87 | KEGG_2016 | http://www.kegg.jp/kegg/download/ | 293 |
| 5966 | 51 | WikiPathways_2016 | http://www.wikipathways.org/index.php/Download_Pathways | 437 |
| 15562 | 887 | ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X | 104 | |
| 17850 | 300 | Kinase_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 17660 | 300 | Kinase_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 1348 | 19 | BioCarta_2016 | http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 237 |
| 934 | 13 | HumanCyc_2016 | http://humancyc.org/ | 152 |
| 2541 | 39 | NCI-Nature_2016 | http://pid.nci.nih.gov/ | 209 |
| 2041 | 42 | Panther_2016 | http://www.pantherdb.org/pathway/ | 112 |
| 5209 | 300 | DrugMatrix | https://ntp.niehs.nih.gov/drugmatrix/ | 7876 |
| 49238 | 1550 | ChEA_2016 | http://amp.pharm.mssm.edu/Enrichr | 645 |
| 2243 | 19 | huMAP | http://proteincomplexes.org/ | 995 |
| 19586 | 545 | Jensen_TISSUES | http://tissues.jensenlab.org/ | 1842 |
| 22440 | 505 | RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 1302 |
| 8184 | 24 | MGI_Mammalian_Phenotype_2017 | http://www.informatics.jax.org/ | 5231 |
| 18329 | 161 | Jensen_COMPARTMENTS | http://compartments.jensenlab.org/ | 2283 |
| 15755 | 28 | Jensen_DISEASES | http://diseases.jensenlab.org/ | 1811 |
| 10271 | 22 | BioPlex_2017 | http://bioplex.hms.harvard.edu/ | 3915 |
| 10427 | 38 | GO_Cellular_Component_2017 | http://www.geneontology.org/ | 636 |
| 10601 | 25 | GO_Molecular_Function_2017 | http://www.geneontology.org/ | 972 |
| 13822 | 21 | GO_Biological_Process_2017 | http://www.geneontology.org/ | 3166 |
| 8002 | 143 | GO_Cellular_Component_2017b | http://www.geneontology.org/ | 816 |
| 10089 | 45 | GO_Molecular_Function_2017b | http://www.geneontology.org/ | 3271 |
| 13247 | 49 | GO_Biological_Process_2017b | http://www.geneontology.org/ | 10125 |
| 21809 | 2316 | ARCHS4_Tissues | http://amp.pharm.mssm.edu/archs4 | 108 |
| 23601 | 2395 | ARCHS4_Cell-lines | http://amp.pharm.mssm.edu/archs4 | 125 |
| 20883 | 299 | ARCHS4_IDG_Coexp | http://amp.pharm.mssm.edu/archs4 | 352 |
| 19612 | 299 | ARCHS4_Kinases_Coexp | http://amp.pharm.mssm.edu/archs4 | 498 |
| 25983 | 299 | ARCHS4_TFs_Coexp | http://amp.pharm.mssm.edu/archs4 | 1724 |
| 19500 | 137 | SysMyo_Muscle_Gene_Sets | http://sys-myo.rhcloud.com/ | 1135 |
| 14893 | 128 | miRTarBase_2017 | http://mirtarbase.mbc.nctu.edu.tw/ | 3240 |
| 17598 | 1208 | TargetScan_microRNA_2017 | http://www.targetscan.org/ | 683 |
| 5902 | 109 | Enrichr_Libraries_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 121 |
| 12486 | 299 | Enrichr_Submissions_TF-Gene_Coocurrence | http://amp.pharm.mssm.edu/Enrichr | 1722 |
| 1073 | 100 | Data_Acquisition_Method_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 12 |
| 19513 | 117 | DSigDB | http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ | 4026 |
| 14433 | 36 | GO_Biological_Process_2018 | http://www.geneontology.org/ | 5103 |
| 8655 | 61 | GO_Cellular_Component_2018 | http://www.geneontology.org/ | 446 |
| 11459 | 39 | GO_Molecular_Function_2018 | http://www.geneontology.org/ | 1151 |
| 19741 | 270 | TF_Perturbations_Followed_by_Expression | http://www.ncbi.nlm.nih.gov/geo/ | 1958 |
| 27360 | 802 | Chromosome_Location_hg19 | http://hgdownload.cse.ucsc.edu/downloads.html | 36 |
| 13072 | 26 | NIH_Funded_PIs_2017_Human_GeneRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 5687 |
| 13464 | 45 | NIH_Funded_PIs_2017_Human_AutoRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 13787 | 200 | Rare_Diseases_AutoRIF_ARCHS4_Predictions | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 13929 | 200 | Rare_Diseases_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 16964 | 200 | NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 17258 | 200 | NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 5684 |
| 10352 | 58 | Rare_Diseases_GeneRIF_Gene_Lists | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 10471 | 76 | Rare_Diseases_AutoRIF_Gene_Lists | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 12419 | 491 | SubCell_BarCode | http://www.subcellbarcode.org/ | 104 |
| 19378 | 37 | GWAS_Catalog_2019 | https://www.ebi.ac.uk/gwas | 1737 |
| 6201 | 45 | WikiPathways_2019_Human | https://www.wikipathways.org/ | 472 |
| 4558 | 54 | WikiPathways_2019_Mouse | https://www.wikipathways.org/ | 176 |
| 3264 | 22 | TRRUST_Transcription_Factors_2019 | https://www.grnpedia.org/trrust/ | 571 |
| 7802 | 92 | KEGG_2019_Human | https://www.kegg.jp/ | 308 |
| 8551 | 98 | KEGG_2019_Mouse | https://www.kegg.jp/ | 303 |
| 12444 | 23 | InterPro_Domains_2019 | https://www.ebi.ac.uk/interpro/ | 1071 |
| 9000 | 20 | Pfam_Domains_2019 | https://pfam.xfam.org/ | 608 |
| 7744 | 363 | DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 | https://depmap.org/ | 558 |
| 6204 | 387 | DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 | https://depmap.org/ | 325 |
| 13420 | 32 | MGI_Mammalian_Phenotype_Level_4_2019 | http://www.informatics.jax.org/ | 5261 |
| 14148 | 122 | UK_Biobank_GWAS_v1 | https://www.ukbiobank.ac.uk/tag/gwas/ | 857 |
| 9813 | 49 | BioPlanet_2019 | https://tripod.nih.gov/bioplanet/ | 1510 |
| 1397 | 13 | ClinVar_2019 | https://www.ncbi.nlm.nih.gov/clinvar/ | 182 |
| 9116 | 22 | PheWeb_2019 | http://pheweb.sph.umich.edu/ | 1161 |
| 17464 | 63 | DisGeNET | https://www.disgenet.org | 9828 |
| 394 | 73 | HMS_LINCS_KinomeScan | http://lincs.hms.harvard.edu/kinomescan/ | 148 |
| 11851 | 586 | CCLE_Proteomics_2020 | https://portals.broadinstitute.org/ccle | 378 |
| 8189 | 421 | ProteomicsDB_2020 | https://www.proteomicsdb.org/ | 913 |
| 18704 | 100 | lncHUB_lncRNA_Co-Expression | https://amp.pharm.mssm.edu/lnchub/ | 3729 |
| 5605 | 39 | Virus-Host_PPI_P-HIPSTer_2020 | http://phipster.org/ | 6715 |
| 5718 | 31 | Elsevier_Pathway_Collection | http://www.transgene.ru/disease-pathways/ | 1721 |
| 14156 | 40 | Table_Mining_of_CRISPR_Studies | 802 |
# DEGs up and down per cluster
cluster_all = sort(unique(sc_markers[,"cluster"]))
genesets_up = lapply(cluster_all, function(x) {
tmp = sc_markers_filt_up %>%
dplyr::filter(cluster==x) %>%
dplyr::pull(gene)
# Pick the first matching Entrez symbol
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1]) %>%
na.exclude() %>% unique()
return(tmp)
})
genesets_down = lapply(cluster_all, function(x) {
tmp = sc_markers_filt_down %>%
dplyr::filter(cluster==x) %>%
dplyr::pull(gene)
# Pick the first matching Entrez symbol
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1]) %>%
na.exclude() %>% unique()
return(tmp)
})
names(genesets_up) = paste0("DEG_up_cluster_", cluster_all)
names(genesets_down) = paste0("DEG_down_cluster_", cluster_all)
genesets = c(genesets_up, genesets_down)
# Loop through gene lists
enriched = list()
for (i in 1:length(genesets)) {
if (length(genesets[[i]]) >= 3) {
message("Geneset ", names(genesets)[i])
enriched[[i]] = enrichR::enrichr(genesets[[i]], databases=param$enrichr_dbs)
} else {
message("Geneset ", names(genesets)[i], " has less than 3 genes; skip enrichr")
enriched[[i]] = NA
}
}
names(enriched) = names(genesets)
# Write enrichment results to file
enriched_top = matrix(NA, nrow=0, ncol=6)
colnames(enriched_top) = c("GeneSet", "Database", "Term", "Overlap", "Adjusted_pval", "Genes")
for (i in 1:length(enriched)) {
if (!is.null(enriched[[i]])) {
openxlsx::write.xlsx(enriched[[i]], file=paste0(param$path_out, "/Functions_", names(enriched)[i], ".xlsx"))
}
}The following table contains the top enriched term per geneset and datqbase.
for (i in 1:length(enriched)) {
if ((!is.null(enriched[[i]])) & (!is.na(enriched[[i]]))) {
# Remember top term per geneset
for(j in 1:length(enriched[[i]])) {
enriched_top = rbind(enriched_top,
c(names(enriched)[i],
names(enriched[[i]])[j],
enriched[[i]][[j]][1, c("Term", "Overlap", "Adjusted.P.value", "Genes")]))
}
}
}
# Print table of top terms per gene set
knitr::kable(enriched_top, align="l", caption="Top enriched term per geneset") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px")| GeneSet | Database | Term | Overlap | Adjusted_pval | Genes |
|---|---|---|---|---|---|
| DEG_up_cluster_1 | GO_Molecular_Function_2018 | RAGE receptor binding (GO:0050786) | 4/9 | 0.00678181447009466 | S100A12;S100A4;S100A9;S100A8 |
| DEG_up_cluster_1 | GO_Biological_Process_2018 | neutrophil activation involved in immune response (GO:0002283) | 86/483 | 1.32400184581104e-64 | FCN1;CDA;SERPINA1;ITGAM;RAB3D;CTSZ;ITGB2;PYGL;CTSS;SIRPB1;PYCARD;LGALS3;GLIPR1;FTH1;ANPEP;TIMP2;COTL1;CTSH;SIRPA;CD36;RAC1;CTSD;CD33;CTSB;ACTR2;SERPINB1;FCER1G;ANXA2;CD93;SLC11A1;PGAM1;ATP6AP2;PLAUR;CYBB;MCEMP1;TNFRSF1B;CKAP4;FGR;TYROBP;RAB31;NPC2;PECAM1;S100A9;CD44;S100A8;TLR2;FTL;CFD;GRN;ASAH1;CLEC12A;BRI3;GSTP1;STXBP2;PTAFR;C5AR1;FGL2;FPR1;MGST1;IQGAP1;RETN;CFP;LILRA2;GNS;CST3;SDCBP;ALOX5;CREG1;PSAP;S100A12;CD14;LTA4H;S100A11;GCA;ADA2;LILRB2;LYZ;LILRB3;CPPED1;BST1;FCGR2A;QPCT;MNDA;LAMTOR1;CD68;HSPA1A |
| DEG_up_cluster_1 | GO_Cellular_Component_2018 | ficolin-1-rich granule (GO:0101002) | 36/184 | 3.85625435133997e-27 | FCN1;CFD;CDA;ASAH1;SERPINA1;GSTP1;CTSZ;ITGB2;FGL2;FPR1;PYGL;GNS;CTSS;CST3;LGALS3;FTH1;ALOX5;COTL1;TIMP2;SIRPA;CTSH;LTA4H;RAC1;CTSD;CTSB;ACTR2;FCER1G;CD93;SLC11A1;PGAM1;ATP6AP2;LILRB2;QPCT;MNDA;LAMTOR1;HSPA1A |
| DEG_up_cluster_2 | GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | 2/6 | 0.0131794134484363 | CD3G;CD3E |
| DEG_up_cluster_2 | GO_Biological_Process_2018 | T cell activation (GO:0042110) | 4/88 | 0.00521173489558323 | CD2;CD3G;CD3E;CD3D |
| DEG_up_cluster_2 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 3/16 | 0.000151745632243574 | CD3G;CD3E;CD3D |
| DEG_up_cluster_3 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 23/1387 | 1.28127730031412e-13 | RPL5;RPL30;NPM1;RPL32;RPS5;RPS6;RPL35A;NOSIP;RPS3A;RPS14;RPS25;RPS28;RPS15A;RPS16;RPS27;RPS18;RCAN3;RPL36;RPS3;RPL38;RPS27A;RPS21;RPL19 |
| DEG_up_cluster_3 | GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | 21/89 | 4.11147818898702e-36 | RPL5;RPL30;RPL32;RPS5;RPS6;RPL35A;RPS3A;RPS14;RPS25;RPS28;RPS15A;RPS16;RPS27;RPS18;RPS29;RPL36;RPS3;RPL38;RPS27A;RPS21;RPL19 |
| DEG_up_cluster_3 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 21/124 | 7.9444055044538e-34 | RPL5;RPL30;RPL32;RPS5;RPS6;RPL35A;RPS3A;RPS14;RPS25;RPS28;RPS15A;RPS16;RPS27;RPS18;RPS29;RPL36;RPS3;RPL38;RPS27A;RPS21;RPL19 |
| DEG_up_cluster_4 | GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | 3/6 | 0.000278579288842565 | LCK;CD3G;CD3E |
| DEG_up_cluster_4 | GO_Biological_Process_2018 | regulation of immune response (GO:0050776) | 11/251 | 5.82769441677551e-08 | NCR3;KLRB1;CD8B;CD81;CD8A;CD3G;KLRD1;CD247;CD3E;CD3D;HCST |
| DEG_up_cluster_4 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 7/16 | 1.2443646138536e-12 | ZAP70;CD8B;CD8A;CD3G;CD247;CD3E;CD3D |
| DEG_up_cluster_5 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 8/16 | 2.30714097063199e-12 | CD74;HLA-DMA;HLA-DMB;CD81;HLA-DRA;HLA-DOB;MS4A1;HLA-DRB1 |
| DEG_up_cluster_5 | GO_Biological_Process_2018 | B cell activation (GO:0042113) | 11/73 | 1.71187747009338e-10 | FCRL1;TPD52;CD79B;CD79A;CD40;MEF2C;PRKCB;BANK1;BLNK;BTK;MS4A1 |
| DEG_up_cluster_5 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 11/14 | 1.90991361405332e-21 | CD74;HLA-DRB5;HLA-DMA;HLA-DMB;HLA-DPB1;HLA-DRA;HLA-DOB;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_6 | GO_Molecular_Function_2018 | MHC protein binding (GO:0042287) | 6/30 | 2.75926896738375e-05 | CD74;CD4;CLEC7A;LILRB1;PILRA;LILRB2 |
| DEG_up_cluster_6 | GO_Biological_Process_2018 | neutrophil degranulation (GO:0043312) | 33/479 | 5.4405490109824e-22 | FCN1;CFD;CSTB;GRN;SERPINA1;BRI3;GSTP1;CTSZ;FGL2;CFP;CTSS;PYCARD;CST3;LGALS3;FTH1;PSAP;COTL1;ITGAX;RAC1;S100A11;CTSB;FCER1G;ANXA2;RNASET2;RHOG;LILRB2;TNFRSF1B;CPPED1;TYROBP;NPC2;PECAM1;CD68;FTL |
| DEG_up_cluster_6 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 10/14 | 2.13917215495991e-17 | CD74;HLA-DRB5;HLA-DMA;HLA-DMB;HLA-DPB1;HLA-DRA;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_7 | GO_Molecular_Function_2018 | MHC class II receptor activity (GO:0032395) | 4/10 | 9.70430382565524e-06 | HLA-DRA;HLA-DQA1;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_7 | GO_Biological_Process_2018 | antigen receptor-mediated signaling pathway (GO:0050851) | 11/257 | 2.46486587005965e-07 | BLK;CD79B;CD79A;HLA-DRB5;CD19;HLA-DPB1;HLA-DRA;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_7 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 8/14 | 1.5698916744281e-15 | CD74;HLA-DRB5;HLA-DPB1;HLA-DRA;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_8 | GO_Molecular_Function_2018 | CCR chemokine receptor binding (GO:0048020) | 5/38 | 0.0281869290330875 | CCL5;CCL4;CCL3;XCL2;XCL1 |
| DEG_up_cluster_8 | GO_Biological_Process_2018 | regulation of immune response (GO:0050776) | 24/251 | 5.07657238137779e-14 | KLRB1;CD81;CD300A;CD160;SH2D1A;ITGB2;HLA-B;SH2D1B;HLA-C;PILRB;HLA-A;ITGAL;HLA-E;NCR1;FCGR3A;NCR3;TYROBP;KLRF1;SLAMF7;KLRD1;KLRC1;CD247;HCST;CD99 |
| DEG_up_cluster_8 | GO_Cellular_Component_2018 | phagocytic vesicle (GO:0045335) | 9/83 | 3.1679467518145e-05 | GNLY;RAC2;HLA-B;HLA-C;ADAM8;HLA-A;CALR;CORO1A;HLA-E |
| DEG_up_cluster_9 | GO_Molecular_Function_2018 | actin filament binding (GO:0051015) | 19/127 | 4.91752391029081e-07 | DBNL;DMTN;TPM4;WDR1;WIPF1;TPM3;ACTN1;ARPC1B;TPM1;TNNC2;TWF2;NEXN;ARPC4;ARPC5;CORO1C;SAMD14;ABLIM3;MYH9;FLNA |
| DEG_up_cluster_9 | GO_Biological_Process_2018 | platelet degranulation (GO:0002576) | 32/124 | 6.43436151930819e-20 | APP;SPARC;WDR1;ITGA2B;PLEK;ENDOD1;F13A1;PDGFA;CLU;THBS1;CYB5R1;TMSB4X;FLNA;TIMP1;LY6G6F;VTI1B;SRGN;TGFB1;ACTN1;RAB27B;PPBP;GTPBP2;TUBA4A;SELP;MMRN1;CD9;TAGLN2;MAGED2;TLN1;VCL;FERMT3;PF4 |
| DEG_up_cluster_9 | GO_Cellular_Component_2018 | platelet alpha granule (GO:0031091) | 25/90 | 6.20012087751719e-17 | APP;SPARC;ITGA2B;F13A1;PDGFA;CLU;THBS1;CYB5R1;TMSB4X;TIMP1;LY6G6F;VTI1B;SNCA;SRGN;TGFB1;ACTN1;PPBP;GTPBP2;SELP;VAMP7;MMRN1;CD9;MAGED2;PF4;FERMT3 |
| DEG_down_cluster_1 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 52/1387 | 2.32651881903912e-24 | RPL4;RPL5;RPL3;RPL32;RPLP0;PEBP1;RPL10A;RPS15;RPS4X;RPS14;RPL7A;RPS16;RPS19;RPL18A;RPS18;RPL36;RPL35;RPL38;RPL37;RPS10;RPL21;RPS8;RPS5;RPS6;NOSIP;RPL13A;RPSA;RPS3A;RPL37A;ARHGEF1;RPL29;RPL10;RPL12;RPL11;RPS15A;RPS3;RPL14;RPL13;RPS2;RPS27A;RPL18;RPL19;HSPA8;RPL41;NPM1;RPL23A;RPS25;RPS28;RPS27;RPS21;NOP53;RPS23 |
| DEG_down_cluster_1 | GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | 48/89 | 2.15022968946199e-81 | RPL4;RPL5;RPL3;RPL32;RPL10;RPL12;RPLP0;RPL11;RPL10A;RPS4X;RPS15;RPL7A;RPS14;RPS16;RPS15A;RPL18A;RPS19;RPS18;RPL36;RPL14;RPS3;RPL35;RPLP2;RPL13;RPL38;RPL37;RPS2;RPL18;RPS27A;RPS10;RPL19;RPL41;RPL21;RPS8;RPS5;RPS6;RPL13A;RPS3A;RPSA;RPL23A;RPS25;RPS28;RPS27;RPS29;RPL37A;RPL29;RPS21;RPS23 |
| DEG_down_cluster_1 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 48/124 | 2.66401972736485e-73 | RPL4;RPL5;RPL3;RPL32;RPL10;RPL12;RPLP0;RPL11;RPL10A;RPS15;RPS4X;RPS14;RPL7A;RPS16;RPS15A;RPS19;RPL18A;RPS18;RPL36;RPL14;RPS3;RPL35;RPLP2;RPL13;RPL38;RPL37;RPS2;RPL18;RPS27A;RPS10;RPL19;RPL41;RPL21;RPS8;RPS5;RPS6;RPL13A;RPSA;RPS3A;RPL23A;RPS25;RPS28;RPS27;RPS29;RPL37A;RPL29;RPS21;RPS23 |
| DEG_down_cluster_2 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 5/16 | 6.87710884490933e-05 | CD74;HLA-DMA;HLA-DMB;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_2 | GO_Biological_Process_2018 | neutrophil degranulation (GO:0043312) | 47/479 | 9.80276810457355e-38 | FCN1;CFD;GRN;ASAH1;SERPINA1;ITGAM;CLEC12A;BRI3;HVCN1;GSTP1;C5AR1;STXBP2;CTSZ;FGL2;FPR1;CFP;CTSS;CST3;LGALS3;SDCBP;FTH1;PSAP;COTL1;TIMP2;S100A12;CTSH;CD14;LTA4H;CD36;RAC1;CTSD;FCER1G;GCA;SLC11A1;CYBB;CYBA;LYZ;FGR;TYROBP;FCGR2A;NPC2;PECAM1;MNDA;CD68;S100A9;S100A8;FTL |
| DEG_down_cluster_2 | GO_Cellular_Component_2018 | tertiary granule (GO:0070820) | 21/164 | 2.79578066724644e-18 | ASAH1;ITGAM;FCER1G;CLEC12A;SLC11A1;STX7;STXBP2;FPR1;CYBB;CYBA;CFP;LYZ;CTSS;CST3;LGALS3;FTH1;TIMP2;CTSH;LTA4H;RAC1;CTSD |
| DEG_down_cluster_3 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 5/16 | 0.000385622575694362 | CD74;HLA-DMA;HLA-DMB;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_3 | GO_Biological_Process_2018 | neutrophil degranulation (GO:0043312) | 62/479 | 1.01962325578583e-48 | FCN1;SERPINA1;ITGAM;CTSZ;ITGB2;CTSS;PYCARD;LGALS3;GLIPR1;FTH1;TIMP2;COTL1;CTSH;CD36;RAC1;CTSD;CTSB;CAP1;SERPINB1;FCER1G;ANXA2;SLC11A1;PGAM1;CYBB;CYBA;DYNLL1;TNFRSF1B;RHOA;FGR;TYROBP;NPC2;PECAM1;S100A9;S100A8;FTL;CFD;CD63;GRN;ASAH1;CLEC12A;BRI3;GSTP1;HVCN1;STXBP2;C5AR1;FGL2;FPR1;IQGAP1;CFP;CST3;ALOX5;PSAP;S100A12;CD14;LTA4H;S100A11;CTSA;GCA;LYZ;MNDA;PTPN6;CD68 |
| DEG_down_cluster_3 | GO_Cellular_Component_2018 | ficolin-1-rich granule (GO:0101002) | 27/184 | 1.67690243473523e-21 | FCN1;CFD;ASAH1;SERPINA1;GSTP1;CTSZ;ITGB2;FGL2;FPR1;CTSS;CST3;LGALS3;FTH1;ALOX5;COTL1;TIMP2;CTSH;LTA4H;RAC1;CTSD;CTSB;FCER1G;SLC11A1;PGAM1;DYNLL1;RHOA;MNDA |
| DEG_down_cluster_4 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 5/16 | 3.70491314281004e-05 | CD74;HLA-DMA;HLA-DMB;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_4 | GO_Biological_Process_2018 | neutrophil degranulation (GO:0043312) | 42/479 | 1.23476845929677e-33 | FCN1;CFD;GRN;ASAH1;SERPINA1;CLEC12A;BRI3;HVCN1;GSTP1;C5AR1;CTSZ;FGL2;FPR1;CFP;CTSS;CST3;SDCBP;FTH1;PSAP;COTL1;TIMP2;S100A12;CD14;LTA4H;CD36;RAC1;S100A11;CTSB;FCER1G;GCA;SLC11A1;CYBB;ADA2;LYZ;FGR;TYROBP;NPC2;MNDA;CD68;S100A9;S100A8;FTL |
| DEG_down_cluster_4 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 10/14 | 2.13917215495991e-17 | CD74;HLA-DRB5;HLA-DMA;HLA-DMB;HLA-DPB1;HLA-DRA;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1 |
| DEG_down_cluster_5 | GO_Molecular_Function_2018 | RAGE receptor binding (GO:0050786) | 4/9 | 0.000642574049815309 | S100A12;S100A4;S100A9;S100A8 |
| DEG_down_cluster_5 | GO_Biological_Process_2018 | neutrophil degranulation (GO:0043312) | 43/479 | 1.33577796021931e-28 | FCN1;CFD;CD63;GRN;SERPINA1;CLEC12A;BRI3;GSTP1;C5AR1;STXBP2;ITGB2;FGL2;FPR1;CFP;CTSS;CST3;LGALS3;SDCBP;ADGRE5;PSAP;COTL1;TIMP2;S100A12;CD14;CD36;RAC1;CTSD;S100A11;CTSC;CTSB;CTSA;FCER1G;ANXA2;SLC11A1;LYZ;TNFRSF1B;TYROBP;PKM;MNDA;CD68;S100A9;S100A8;CD44 |
| DEG_down_cluster_5 | GO_Cellular_Component_2018 | ficolin-1-rich granule (GO:0101002) | 19/184 | 4.73231648701155e-13 | FCN1;CFD;SERPINA1;FCER1G;SLC11A1;GSTP1;ITGB2;FGL2;FPR1;CTSS;CST3;LGALS3;PKM;COTL1;TIMP2;MNDA;RAC1;CTSD;CTSB |
| DEG_down_cluster_6 | GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | 3/6 | 0.000243843564575923 | LCK;CD3G;CD3E |
| DEG_down_cluster_6 | GO_Biological_Process_2018 | T cell activation (GO:0042110) | 8/88 | 9.77097809734236e-08 | CD2;LCK;CD7;RHOH;CD3G;CD3E;CD3D;LAT |
| DEG_down_cluster_6 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 4/16 | 1.77903609776843e-05 | CD3G;CD247;CD3E;CD3D |
| DEG_down_cluster_7 | GO_Molecular_Function_2018 | RAGE receptor binding (GO:0050786) | 4/9 | 0.00035340769107794 | S100A12;S100A4;S100A9;S100A8 |
| DEG_down_cluster_7 | GO_Biological_Process_2018 | neutrophil degranulation (GO:0043312) | 39/479 | 1.27577353655812e-26 | FCN1;CFD;GRN;ASAH1;SERPINA1;BRI3;GSTP1;C5AR1;ITGB2;FGL2;FPR1;CFP;CTSS;CST3;LGALS3;SDCBP;PSAP;S100A12;CD14;RAC1;CD36;CTSD;S100A11;CTSC;CTSB;CAP1;FCER1G;SLC11A1;CYBB;LYZ;TNFRSF1B;TYROBP;NPC2;PECAM1;MNDA;CD68;S100A9;S100A8;CD44 |
| DEG_down_cluster_7 | GO_Cellular_Component_2018 | ficolin-1-rich granule (GO:0101002) | 17/184 | 8.52999030200048e-12 | FCN1;CFD;ASAH1;SERPINA1;FCER1G;SLC11A1;GSTP1;ITGB2;FGL2;FPR1;CTSS;CST3;LGALS3;MNDA;RAC1;CTSD;CTSB |
| DEG_down_cluster_8 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 31/1387 | 1.81927015184966e-07 | RPL30;GRN;RPL32;RPL34;RPLP0;RPL11;RPL8;RPL9;YBX3;RPL7;LGALS3;RPS16;RPL18A;RPS18;RPL13;LTA4H;RPS11;RPL39;RPS13;RPS8;RPL22;RPL35A;NAP1L1;EEF2;RPS28;RCAN3;RPS20;KCTD12;PABPC1;VIM;RPS23 |
| DEG_down_cluster_8 | GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | 21/89 | 2.40994513167354e-24 | RPL30;RPL32;RPS8;RPL34;RPLP0;RPL22;RPL11;RPL35A;RPL8;RPL9;RPL7;RPS28;RPS16;RPL18A;RPS18;RPL13;RPS20;RPS11;RPL39;RPS13;RPS23 |
| DEG_down_cluster_8 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 21/124 | 4.0626169979302e-22 | RPL30;RPL32;RPS8;RPL34;RPL22;RPLP0;RPL11;RPL35A;RPL8;RPL9;RPL7;RPS28;RPS16;RPL18A;RPS18;RPL13;RPS20;RPS11;RPL39;RPS13;RPS23 |
| DEG_down_cluster_9 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 128/1387 | 2.3425928880554e-84 | RPL4;EIF4A2;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;HNRNPU;RPL8;RPL10A;RPL9;RPL6;RPL7;RPS15;RPS14;PNN;RPS16;RPL18A;RPS19;RPS18;RPL36;RPL35;ARL6IP4;RPL38;RPL37;RPS11;RPL39;RPS10;RPS13;DDX17;SRRM2;PNISR;RPS9;RPL21;RPS7;RPS8;RPL23;RPS5;RPL22;RPS6;RPSA;EEF1A1;NCL;RPL24;SRSF3;RPL27;RPL26;SRSF5;RPL29;RPL28;SRSF7;PPIA;SF1;DDX5;RPS4Y1;CORO1A;ZFP36L2;ZFP36L1;HNRNPDL;PCBP2;BTF3;HSPA8;RPL41;JUN;FUS;EEF2;RPS26;RPS25;RPS28;RPS27;RPL27A;RPS20;CALR;RPS21;RPS24;RPS23;HSP90AB1;RPLP0;RPS4X;RBM3;RPL7A;C1QBP;RACK1;HSP90AA1;RPL13A;PRPF40A;RPS3A;EDF1;NSA2;HNRNPH1;EEF1D;RPL37A;ARHGEF1;PABPC1;RPL10;RPL12;SF3B6;RPL11;RPL36A;RPS15A;RPL14;RPS3;RPL13;RPS2;RPL15;RPS27A;RPL18;HNRNPA1;RPL17;EIF4B;RPL19;RBM39;NPM1;HNRNPA3;CNBP;RPL35A;RPL23A;EIF3L;EIF3G;EIF3H;RPL22L1;EIF3E;FAU;VIM;NOP53;TPT1;RAN |
| DEG_down_cluster_9 | GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | 77/89 | 2.07279641773225e-138 | RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;RPLP0;RPL10A;RPL8;RPL9;RPL6;RPL7;RPS4X;RPS15;RPL7A;RPS14;RPS16;RPL18A;RPS19;RPS18;RPL36;RPL35;RPLP2;RPL38;RPL37;RPS11;RPL39;RPS10;RPS13;RPS9;RPL21;RPS7;RPS8;RPL23;RPS5;RPL22;RPS6;RPL13A;RPS3A;RPSA;RPL37A;RPL24;RPL27;RPL26;RPL29;RPL28;UBA52;RPL10;RPL12;RPL11;RPL36A;RPS4Y1;RPS15A;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL17;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL27A;RPS20;RPS21;RPS24;RPS23 |
| DEG_down_cluster_9 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 79/124 | 4.91017505313315e-124 | RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;RPLP0;RPL8;RPL10A;RPL9;RPL6;RPL7;RPS15;RPS4X;RPS14;RPL7A;RPS16;RPS19;RPL18A;RPL36AL;RPS18;RPL36;RACK1;RPL35;RPLP2;RPL38;RPL37;RPS11;RPL39;RPS10;RPS13;RPS9;RPL21;RPS7;RPS8;RPL23;RPS5;RPL22;RPS6;RPL13A;RPSA;RPS3A;RPL37A;RPL24;RPL27;RPL26;RPL29;RPL28;RPL10;RPL12;RPL11;RPL36A;RPS4Y1;RPS15A;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL17;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL27A;RPS20;RPL22L1;RPS21;RPS24;RPS23 |
We export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser.
# Export UMAP coordinates
loupe_umap = as.data.frame(sc@reductions$umap@cell.embeddings)
loupe_umap$Barcode = gsub(pattern="_", replacement="-", x=rownames(loupe_umap), fixed=TRUE)
if (length(grep(pattern="-$", x=loupe_umap$Barcode, perl=TRUE))==0) loupe_umap$Barcode=paste0(loupe_umap$Barcode, "-1")
loupe_umap = loupe_umap[, c("Barcode", "UMAP_1", "UMAP_2")]
colnames(loupe_umap) = c("Barcode", "UMAP-1", "UMAP-2")
write.table(loupe_umap, file=paste0(param$path_out, "/Seurat2Loupe_Umap.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export categorical metadata
meta_to_export = c("orig.ident", "seurat_clusters", "Phase")
# if (run.hto) meta.to.export = c(meta.to.export, "HTO_maxID", "HTO_classification", "HTO_classification.global", "hash.ID")
loupe_meta = as.data.frame(sc@meta.data[,meta_to_export])
loupe_meta = cbind(Barcode=gsub(pattern="_", replacement="-", rownames(loupe_meta), fixed=TRUE), loupe_meta)
if (length(grep(pattern="-$", x=loupe_meta$Barcode, perl=TRUE))==0) loupe_meta$Barcode=paste0(loupe_meta$Barcode, "-1")
write.table(x=loupe_meta, file=paste0(param$path_out, "/Seurat2Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export gene sets
loupe_genesets = data.frame(List=paste0("DEG_up_cluster_", sc_markers_filt_up[,"cluster"]),
Name=sc_markers_filt_up[,"gene"],
Ensembl=seurat_rowname_to_ensembl[sc_markers_filt_up[,"gene"]])
loupe_genesets = rbind(loupe_genesets,
data.frame(List=paste0("DEG_down_cluster_", sc_markers_filt_down[,"cluster"]),
Name=sc_markers_filt_down[,"gene"],
Ensembl=seurat_rowname_to_ensembl[sc_markers_filt_down[,"gene"]]))
genesets_to_export = list(genes_cc_s_phase=genes_s[,2], genes_cc_g2m_phase=genes_g2m[,2])
for (i in names(genesets_to_export)) {
tmp_genes = genesets_to_export[[i]]
tmp_genes = tmp_genes[tmp_genes %in% names(symbol_to_ensembl)]
loupe_genesets = rbind(loupe_genesets,
data.frame(List=i,
Name=tmp_genes,
Ensembl=seurat_rowname_to_ensembl[tmp_genes]))
}
write.table(loupe_genesets, file=paste0(param$path_out, "/Seurat2Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")All files generated with this report are written into the provided output folder test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/results:
This report was generated using the scrnaseq GitHub repository. Software versions were collected at run time.
out = scrnaseq_session_info()
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| Name | Version |
|---|---|
| ktrns/scrnaseq | fbef7f2f35eb0917562bf1f774695657332fcb3c |
| R | R version 3.6.1 (2019-07-05) |
| Platform | x86_64-apple-darwin15.6.0 (64-bit) |
| Operating system | macOS Mojave 10.14.6 |
| Packages | abind1.4-5, AnnotationDbi1.46.1, ape5.3, assertthat0.2.1, backports1.1.5, bibtex0.4.2, Biobase2.44.0, BiocGenerics0.30.0, BiocParallel1.18.1, biomaRt2.40.5, bit1.1-14, bit640.9-7, bitops1.0-6, blme1.0-4, blob1.2.0, boot1.3-23, caTools1.17.1.3, cli2.0.2, cluster2.1.0, codetools0.2-16, colorspace1.4-1, cowplot1.0.0, crayon1.3.4, curl4.3, data.table1.12.6, DBI1.0.0, DelayedArray0.10.0, digest0.6.25, dplyr0.8.3, enrichR2.1, evaluate0.14, fansi0.4.0, farver2.0.1, fitdistrplus1.0-14, future1.15.1, future.apply1.3.0, gdata2.18.0, GenomeInfoDb1.20.0, GenomeInfoDbData1.2.1, GenomicRanges1.36.1, ggplot23.2.1, ggrepel0.8.1, ggridges0.5.1, globals0.12.5, glue1.4.1, gplots3.0.1.1, gridExtra2.3, gtable0.3.0, gtools3.8.1, highr0.8, hms0.5.2, htmltools0.4.0, htmlwidgets1.5.1, httr1.4.1, ica1.0-2, igraph1.2.4.2, IRanges2.18.3, irlba2.3.3, jsonlite1.6.1, kableExtra1.1.0, KernSmooth2.23-16, knitcitations1.0.10, knitr1.26, labeling0.3, lattice0.20-38, lazyeval0.2.2, leiden0.3.1, lifecycle0.1.0, listenv0.8.0, lme41.1-21, lmtest0.9-37, lsei1.2-0, lubridate1.7.4, magrittr1.5, MASS7.3-51.4, Matrix1.2-18, matrixStats0.55.0, memoise1.1.0, minqa1.2.4, munsell0.5.0, nlme3.1-142, nloptr1.2.1, npsurv0.4-0, openxlsx4.1.4, patchwork1.0.0, pbapply1.4-2, pillar1.4.2, pkgconfig2.0.3, plotly4.9.1, plyr1.8.4, png0.1-7, prettyunits1.0.2, progress1.2.2, purrr0.3.3, R62.4.1, RANN2.6.1, RColorBrewer1.1-2, Rcpp1.0.3, RcppAnnoy0.0.14, RcppParallel4.4.4, RCurl1.95-4.12, readr1.3.1, RefManageR1.2.12, reshape21.4.3, reticulate1.13, rjson0.2.20, rlang0.4.6, rmarkdown1.18, ROCR1.0-7, RSpectra0.16-0, RSQLite2.1.4, rstudioapi0.11, rsvd1.0.2, Rtsne0.15, rvest0.3.5, S4Vectors0.22.1, scales1.1.0, sctransform0.2.0, sessioninfo1.1.1, Seurat3.1.5, SingleCellExperiment1.6.0, stringi1.4.3, stringr1.4.0, SummarizedExperiment1.14.1, survival3.1-8, tibble2.1.3, tidyr1.0.0, tidyselect0.2.5, tsne0.1-3, uwot0.1.5, vctrs0.2.0, viridisLite0.3.0, webshot0.5.2, withr2.1.2, xfun0.11, XML3.98-1.20, xml21.2.2, XVector0.24.0, yaml2.2.0, zeallot0.1.0, zip2.0.4, zlibbioc1.30.0, zoo1.8-6 |
Chen, Edward Y. 2020. “Enrichr.” . https://amp.pharm.mssm.edu/Enrichr/.
Gandolfo, Luke C., and Terence P. Speed. 2018. “RLE Plots: Visualizing Unwanted Variation in High Dimensional Data.” Edited by Enrique Hernandez-Lemus. PLOS ONE 13 (2). Public Library of Science (PLoS): e0191629. https://doi.org/10.1371/journal.pone.0191629.
Tirosh, I., B. Izar, S. M. Prakadan, M. H. Wadsworth, D. Treacy, J. J. Trombetta, A. Rotem, et al. 2016. “Dissecting the Multicellular Ecosystem of Metastatic Melanoma by Single-Cell RNA-Seq.” Science 352 (6282). American Association for the Advancement of Science (AAAS): 189–96. https://doi.org/10.1126/science.aad0501.
“Understanding Umap.” 2019. https://pair-code.github.io/understanding-umap/. https://pair-code.github.io/understanding-umap/.